library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(countrycode)
# Load the raw survey responses; readr guesses the column types.
df <- read_csv(file = "survey_results_public.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## Respondent = col_double(),
## Age = col_double(),
## CompTotal = col_double(),
## ConvertedComp = col_double(),
## WorkWeekHrs = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
# Keep only the survey columns used in this analysis.
DFF <- select(
  df,
  MainBranch, Country, Gender, Age, YearsCode, YearsCodePro, EdLevel,
  Employment, JobSat, OrgSize, WorkWeekHrs, NEWOvertime, NEWOnboardGood,
  JobSeek, NEWLearn, LanguageWorkedWith, PlatformWorkedWith
)

# Professional developers located in the United States.
# filter() drops rows whose condition evaluates to NA, so no explicit
# is.na() guards are needed. Columns are picked by name (Gender through
# PlatformWorkedWith) instead of by position, so the result does not
# silently change if the select() above is reordered.
DF_USA <- DFF %>%
  filter(
    Country == "United States",
    MainBranch == "I am a developer by profession"
  ) %>%
  select(Gender:PlatformWorkedWith)
# Prepend a `region` column derived from each respondent's country name.
# cbind() returns a plain data.frame here (not a tibble).
DF_LC <- cbind(
  region = as.character(countrycode(
    sourcevar = DFF$Country,
    origin = "country.name",
    destination = "region"
  )),
  DFF
)
## Warning in countrycode(sourcevar = DFF$Country, origin = "country.name", : Some values were not matched unambiguously: Nomadic
# Professional developers in Latin America & the Caribbean.
# subset() treats NA conditions as FALSE and keeps the original row names.
# Both conditions are evaluated on DF_LC itself, and the kept columns are
# named (Gender through PlatformWorkedWith) rather than positional.
DF_LC <- subset(
  DF_LC,
  region == "Latin America & Caribbean" &
    MainBranch == "I am a developer by profession",
  select = Gender:PlatformWorkedWith
)
# Prepend a `region` column derived from each respondent's country name.
# cbind() returns a plain data.frame here (not a tibble).
DF_MIX <- cbind(
  region = as.character(countrycode(
    sourcevar = DFF$Country,
    origin = "country.name",
    destination = "region"
  )),
  DFF
)
## Warning in countrycode(sourcevar = DFF$Country, origin = "country.name", : Some values were not matched unambiguously: Nomadic
# Professional developers from either Latin America & the Caribbean or the
# United States, in a single frame. subset() treats NA conditions as FALSE
# and keeps the original row names; columns are selected by name
# (Country through PlatformWorkedWith) instead of by position.
DF_MIX <- subset(
  DF_MIX,
  (region == "Latin America & Caribbean" | Country == "United States") &
    MainBranch == "I am a developer by profession",
  select = Country:PlatformWorkedWith
)

# Collapse the country into a two-level region label in one step, so the
# result does not depend on the order of two separate overwrites.
DF_MIX$Country <- ifelse(
  DF_MIX$Country == "United States",
  "Estados Unidos",
  "América Latina"
)
names(DF_MIX)[names(DF_MIX) == "Country"] <- "Region"
DF_MIX
| Pregunta | Variable | Tipo de Variable |
|---|---|---|
| Which of the following describe you, if any? Please check all that apply. If you prefer not to answer, you may leave this question blank. | Género | Nominal |
| What is your age (in years)? If you prefer not to answer, you may leave this question blank. | Edad | Discreta |
| Including any education, how many years have you been coding in total? | Años codeando | Discreta |
# The gender question allows several answers, stored in one string separated
# by ";". Split every non-missing answer so each selected option is counted
# on its own.
#
# answers: character vector of raw Gender responses (may contain NA).
# Returns a character vector with one element per selected option.
split_gender_answers <- function(answers) {
  answered <- answers[!is.na(answers)]
  # strsplit() is vectorised, so no loop (and no repeated c() growth) is
  # needed; as.character() keeps the result a character vector even when
  # there are no answers at all.
  as.character(unlist(strsplit(answered, ";")))
}

gender_vec_USA <- split_gender_answers(DF_USA$Gender)
gender_vec_LC <- split_gender_answers(DF_LC$Gender)
# Two pie charts side by side (one per region) with a shared legend in a thin
# strip underneath.
gender_cols <- c("skyblue", "orange", "red")

# Draw one pie chart of gender answers, labelling each slice with its share.
#
# answers: character vector with one element per selected gender option.
# title:   main title of the chart.
draw_gender_pie <- function(answers, title) {
  counts <- table(answers)
  # Percentages are computed for every category actually present, so the
  # labels always line up with the slices (no positional indexing).
  shares <- round(as.vector(counts) / length(answers) * 100, 2)
  pie(counts, main = title, labels = paste0(shares, "%"), col = gender_cols)
}

layout(matrix(c(1, 2, 3, 3), ncol = 2, byrow = TRUE), heights = c(6, 1))
old_par <- par(mai = rep(0.5, 4))

draw_gender_pie(gender_vec_USA, "Estados Unidos")
draw_gender_pie(gender_vec_LC, "América Latina")

# Legend panel: an empty plot with no margins that only hosts the legend.
# NOTE(review): the legend order assumes table() sorts the categories as
# man / other / woman; confirm against the actual category names.
par(mai = c(0, 0, 0, 0))
plot.new()
legend(
  x = "center", ncol = 3, legend = c("Hombre", "Otros", "Mujer"),
  fill = gender_cols
)

# Restore the graphics state so later plots are not drawn inside this layout
# or with zero margins.
par(old_par)
layout(1)
En el gráfico podemos ver que, si bien la proporción de mujeres que se dedican a la programación en Estados Unidos es el doble que la proporción en América Latina, en ambas regiones la brecha de género es bastante grande.
# Horizontal notched boxplots of age per region, with each region's mean age
# overlaid as a vertical line. boxplot() discards missing values on its own,
# so no na.rm argument is passed (it is not a boxplot parameter).
boxplot(
  DF_MIX$Age ~ DF_MIX$Region,
  xlab = "Edad", ylab = "Region", notch = TRUE, horizontal = TRUE
)
abline(v = mean(DF_USA$Age, na.rm = TRUE), col = "red", lwd = 2)
abline(v = mean(DF_LC$Age, na.rm = TRUE), col = "blue", lwd = 2)
# With horizontal = TRUE the y axis holds the two groups (roughly 0.5 to 2.5),
# so numeric coordinates meant for a vertical plot fall outside the plotting
# region. Anchor the legend with a keyword instead; "right" places it in the
# gap between the two boxes.
legend(
  "right",
  legend = c("media EEUU", "Media SyC"), col = c("red", "blue"), lwd = 2
)
Podemos observar que las edades de los programadores profesionales están más concentradas en América Latina comparando los rangos intercuartílicos:
# Interquartile range of age: Latin America first, then the United States.
IQR(DF_LC[["Age"]], na.rm = TRUE)
## [1] 9
IQR(DF_USA[["Age"]], na.rm = TRUE)
## [1] 12
Además, la edad promedio de los programadores profesionales de América Latina es de 30.14, mientras que la de Estados Unidos es de 34.33:
# Mean age per region, rounded to two decimals (missing ages ignored).
round(mean(DF_LC[["Age"]], na.rm = TRUE), digits = 2)
## [1] 30.14
round(mean(DF_USA[["Age"]], na.rm = TRUE), digits = 2)
## [1] 34.33
Junto con las medianas y la forma de las cajas, podemos concluir que, en general, no hay mucha diferencia de edad entre los programadores profesionales de ambas regiones:
# Median age per region, rounded to two decimals (missing ages ignored).
round(median(DF_LC[["Age"]], na.rm = TRUE), digits = 2)
## [1] 28.5
round(median(DF_USA[["Age"]], na.rm = TRUE), digits = 2)
## [1] 32
# Notched boxplots of total years coding per region, with each region's mean
# overlaid as a horizontal line. YearsCode is read as text, so as.numeric()
# turns any non-numeric answer into NA (with a coercion warning); boxplot()
# discards missing values on its own, so no na.rm argument is passed.
boxplot(
  as.numeric(DF_MIX$YearsCode) ~ DF_MIX$Region,
  xlab = "Region", ylab = "Años Codeando", notch = TRUE
)
# lwd = 2 matches the line width shown in the legend.
abline(
  h = mean(as.numeric(DF_USA$YearsCode), na.rm = TRUE),
  col = "red", lwd = 2
)
abline(
  h = mean(as.numeric(DF_LC$YearsCode), na.rm = TRUE),
  col = "blue", lwd = 2
)
legend(
  1.1, 50,
  legend = c("media EEUU", "Media SyC"), col = c("red", "blue"), lwd = 2
)
En el diagrama de cajas y bigotes podemos ver cómo los programadores profesionales de Estados Unidos llevan más años programando en total en comparación con los de América Latina. Sin embargo, esta diferencia es pequeña.
También es importante destacar la gran diferencia de 6 años que existe entre el tercer cuartil de ambas regiones. Esto implica que el 75% de los programadores profesionales de Estados Unidos cuentan a lo mucho con 22 años programando, mientras que en América Latina solo con 16.
# Quartiles of total years coding per region. Non-numeric YearsCode answers
# become NA in as.numeric() and are then ignored via na.rm = TRUE.
round(quantile(as.numeric(DF_LC[["YearsCode"]]), na.rm = TRUE), digits = 2)
## 0% 25% 50% 75% 100%
## 1 7 10 16 50
round(quantile(as.numeric(DF_USA[["YearsCode"]]), na.rm = TRUE), digits = 2)
## 0% 25% 50% 75% 100%
## 1 8 14 22 50